Solved import nltk import random # read in the

您所在的位置:网站首页 nltk ngram Solved import nltk import random # read in the

Solved import nltk import random # read in the

#Solved import nltk import random # read in the | 来源: 网络整理| 查看: 265

import nltk

import random

# load the raw corpus text from disk
with open('ara_wikipedia_2021_300K-sentences.txt', encoding='utf8') as corpus_file:
    corpus = corpus_file.read()

# split the corpus into word tokens
tokens = nltk.word_tokenize(corpus)

# drop every token that appears in NLTK's Arabic stop-word list
stop_words = set(nltk.corpus.stopwords.words('arabic'))
tokens = [word for word in tokens if word not in stop_words]

# build one list of n-grams per order n, keyed by n
n_values = [2, 3, 4, 5, 6]
models = {order: list(nltk.ngrams(tokens, order)) for order in n_values}

# define a function to generate text from a given model and starting word
def generate_text(model, start_word, num_words):
    """Generate at most ``num_words`` words by chaining n-grams from ``model``.

    Args:
        model: Iterable-once-safe sequence (e.g. a list) of equal-length
            n-gram tuples; it is scanned once per generated word.
        start_word: Word that the first n-gram must begin with.
        num_words: Maximum number of words in the returned text.

    Returns:
        The generated words joined by single spaces. The result is an empty
        string when ``num_words`` is not positive or when no n-gram starts
        with ``start_word``. It may contain fewer than ``num_words`` words
        if no n-gram continues the current context.
    """
    if num_words <= 0:
        return ''

    # choose a random n-gram starting with the start_word
    possibilities = [gram for gram in model if gram[0] == start_word]
    if not possibilities:
        return ''
    current_gram = random.choice(possibilities)

    # The seed n-gram can be longer than the requested length, so cut it
    # down instead of returning more words than were asked for.
    output = list(current_gram[:num_words])

    # generate the remaining words
    for _ in range(num_words - len(current_gram)):
        # choose the next word based on the previous n-1 words
        context = current_gram[1:]
        possibilities = [gram[-1] for gram in model if gram[:-1] == context]
        if not possibilities:
            # dead end: nothing in the model follows this context
            break
        next_word = random.choice(possibilities)
        output.append(next_word)
        # slide the window forward by one word
        current_gram = tuple(context) + (next_word,)

    return ' '.join(output)

# define a function to prompt the user for input and generate text
def prompt_user():
    """Ask for a sentence length and a start word, then generate text.

    Reads two lines from standard input and uses the module-level ``models``
    dict (n-gram order -> list of n-grams) together with ``generate_text``.

    Returns:
        The string produced by ``generate_text`` for a randomly chosen model.

    Raises:
        ValueError: If the entered number of words is not an integer.
    """
    num_words = int(input("Enter the number of words in the desired sentence: "))
    # Surrounding whitespace can never match a token, so drop it.
    start_word = input("Enter one word to start the sentence: ").strip()

    # choose a random model to use
    # ``models`` is keyed by n-gram order, not by sentence length, so pick
    # one of its keys at random and pass the whole n-gram list along
    # (not a single n-gram drawn from it).
    order = random.choice(list(models))
    return generate_text(models[order], start_word, num_words)

# test the text generation function with 8 samples
# each entry is (start word, requested number of words)
samples = [
    ('الإسلام', 10),
    ('الكتاب', 7),
    ('الثقافة', 8),
    ('العلوم', 9),
    ('الفلسفة', 10),
    ('السياسة', 11),
    ('التاريخ', 12),
    ('الفنون', 13),
]

for start_word, num_words in samples:
    print(f"{num_words}-word sentence starting with '{start_word}':")
    # Generate straight from the sample values. Calling prompt_user() here
    # would ignore the sample and block waiting for keyboard input, so no
    # sample output would ever be printed.
    model = models[random.choice(list(models))]
    print(generate_text(model, start_word, num_words))
    print()

# count every trigram in the token stream and keep the 10 most frequent
freq_dist = nltk.FreqDist(nltk.ngrams(tokens, 3))
top_trigrams = freq_dist.most_common(10)

# write one "w1 w2 w3: count" line per trigram
with open('top_trigrams.txt', 'w', encoding='utf8') as out_file:
    out_file.writelines(
        f"{trigram[0]} {trigram[1]} {trigram[2]}: {count}\n"
        for trigram, count in top_trigrams
    )

------------------------------

Output:

10-word sentence starting with 'الإسلام':

الإسلام هو الدين الذي يدعو إلى السلام والتعاون والتسامح والعدل والمساواة والإنصاف والإيثار والإخلاص والإخوانية.

7-word sentence starting with 'الكتاب':

الكتاب هو الأداة التي تمكننا من التعلم والتقدم.

8-word sentence starting with 'الثقافة':

الثقافة هي العنصر الأساسي في بناء المجتمع وتحقيق التقدم.

9-word sentence starting with 'العلوم':

العلوم هي الأداة التي تساعدنا في فهم العالم وتحسين حياتنا وتحقيق التقدم.

10-word sentence starting with 'الفلسفة':

الفلسفة هي البحث عن المعنى والغاية في الحياة والكون والإجابة على الأسئلة الأساسية حول الوجود والواقع والمعرفة.

-------------------

How do I get this output? It is not shown to me when I run this code.



【本文地址】


今日新闻


推荐新闻


CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3